Here is the report of the network analysis of the crop health data.
library(dplyr)
library(plyr)
library(reshape)
library(reshape2)
library(gridExtra)
library(lubridate)
library(doBy)
library(cluster)
library(ggplot2)
library(scales)
library(bioDist)
library(vegan)
library(mvtnorm)
library(igraph)
library(WGCNA)
## ==========================================================================
## *
## * Package WGCNA 1.47 loaded.
## *
## * Important note: It appears that your system supports multi-threading,
## * but it is not enabled within WGCNA in R.
## * To allow multi-threading within WGCNA with all available cores, use
## *
## * allowWGCNAThreads()
## *
## * within R. Use disableWGCNAThreads() to disable threading if necessary.
## * Alternatively, set the following environment variable on your system:
## *
## * ALLOW_WGCNA_THREADS=<number_of_processors>
## *
## * for example
## *
## * ALLOW_WGCNA_THREADS=4
## *
## * To set the environment variable in linux bash shell, type
## *
## * export ALLOW_WGCNA_THREADS=4
## *
## * before running R. Other operating systems or shells will
## * have a similar command to achieve the same aim.
## *
## ==========================================================================
library(cowplot)
library(DiffCorr)
library(pheatmap)
library(RCurl) # run this package for load the data form the website
file <- getURL("https://docs.google.com/spreadsheets/d/1zB7gNdI7Nk7SuHuPWcjzaKnjuwkvL6sOVMo0zMfuV-c/pub?gid=558862364&single=true&output=csv") # load data from the google drive
data <- read.csv(text = file) # read data which is formated as the csv
data[data == "-"] <- NA # replace '-' with NA
data[data == ""] <- NA # replace 'missing data' with NA
# ==== to lower variable names ====
names(data) <- tolower(names(data)) # for more consistancy
data <- data[, !names(data) %in% c("phase", "identifier", "village", "year", "season", "lat", "long",
"fa", "fn", "fp", "lfm", "ced", "cedjul", "hd", "hdjul", "cvr",
"varcoded", "fymcoded", "mu", "nplsqm", "rbpx")
]
#==== corract the variable type =====
data <- transform(data, country = as.factor(country), pc = as.factor(pc),
cem = as.factor(cem), ast = as.factor(ast), ccd = as.numeric(ccd),
vartype = as.factor(vartype), fym = as.character(fym), n = as.numeric(n),
p = as.numeric(p), k = as.numeric(k), mf = as.numeric(mf), wcp = as.factor(wcp),
iu = as.numeric(iu), hu = as.numeric(hu), fu = as.numeric(fu), cs = as.factor(cs),
ldg = as.numeric(ldg), yield = as.numeric(yield), dscum = as.factor(dscum),
wecum = as.factor(wecum), ntmax = as.numeric(ntmax), npmax = as.numeric(npmax),
nltmax = as.numeric(nltmax), nlhmax = as.numeric(nltmax), waa = as.numeric(waa),
wba = as.numeric(wba), dhx = as.numeric(dhx), whx = as.numeric(whx),
ssx = as.numeric(ssx), wma = as.numeric(wma), lfa = as.numeric(lfa),
lma = as.numeric(lma), rha = as.numeric(rha), thrx = as.numeric(thrx),
pmx = as.numeric(pmx), defa = as.numeric(defa), bphx = as.numeric(bphx),
wbpx = as.numeric(wbpx), awx = as.numeric(awx), rbx = as.numeric(rbx),
rbbx = as.numeric(rbbx), glhx = as.numeric(glhx), stbx = as.numeric(stbx),
hbx = as.numeric(hbx), bbx = as.numeric(bbx), blba = as.numeric(blba),
lba = as.numeric(lba), bsa = as.numeric(bsa), blsa = as.numeric(blsa),
nbsa = as.numeric(nbsa), rsa = as.numeric(rsa), lsa = as.numeric(lsa),
shbx = as.numeric(shbx), shrx = as.numeric(shrx), srx = as.numeric(srx),
fsmx = as.numeric(fsmx), nbx = as.numeric(nbx), dpx = as.numeric(dpx),
rtdx = as.numeric(rtdx), rsdx = as.numeric(rsdx), gsdx = as.numeric(gsdx),
rtx = as.numeric(rtx))
data$pc <- ifelse(data$pc == "rice", 1, 0)
# Crop establisment method
levels(data$cem)[levels(data$cem) == "trp"] <- 1
levels(data$cem)[levels(data$cem) == "TPR"] <- 1
levels(data$cem)[levels(data$cem) == "DSR"] <- 2
levels(data$cem)[levels(data$cem) == "dsr"] <- 2
# fym there are two type 0 and 1, raw data are recorded as no, yes, and value, if the value is 0 which mean 0 and if the value more than 0 which means 1
data$fym <- ifelse(data$fym == "no", 0,
ifelse(data$fym == "0", 0, 1
)
)
# vartype there are three type treditional varieties, modern varities and hybrid
data$vartype <- ifelse(data$vartype == "tv", 1,
ifelse(data$vartype == "mv", 2,
ifelse(data$vartype == "hyb", 3, NA
)
)
)
# wcp weed control management
levels(data$wcp)[levels(data$wcp) == "hand"] <- 1
levels(data$wcp)[levels(data$wcp) == "herb"] <- 2
levels(data$wcp)[levels(data$wcp) == "herb-hand"] <- 3
# Crop Status
levels(data$cs)[levels(data$cs) == "very poor"] <- 1
levels(data$cs)[levels(data$cs) == "poor"] <- 2
levels(data$cs)[levels(data$cs) == "average"] <- 3
levels(data$cs)[levels(data$cs) == "good"] <- 4
levels(data$cs)[levels(data$cs) == "very good"] <- 5
# clean the data
num.data <- apply(data[, -c(1,2)], 2, as.numeric)
num.data <- as.data.frame(as.matrix(num.data))
data <- cbind(data[1:2], num.data)
data <- data[,apply(data[, -c(1,2)], 2, var, na.rm = TRUE) != 0] # exclude the column with variation = 0
data <- data[complete.cases(data),] # exclude row which cantain NA
#==== cluster analysis of the production sitatuon of the survey data ====
start.PS <- "pc"
end.PS <- "fu"
start.col.PS <- match(start.PS, names(data))
end.col.PS <- match(end.PS, names(data))
PS.data <- data[, c(1, start.col.PS:end.col.PS)]
# transform all variable to numeric type
# wss <- (nrow(PS.data)-1)* sum(apply(PS.data, 2, var))
# for (i in 2:15) wss[i] <- sum(kmeans(PS.data,
# centers=i)$withinss)
# plot(1:15, wss, type="b", xlab="Number of Clusters",
# ylab="Within groups sum of squares")
# distance matrix
dist.PS <- daisy(PS.data[-1])
## Warning in daisy(PS.data[-1]): binary variable(s) 1, 2, 6 treated as
## interval scaled
cluster.PS <- hclust(dist.PS, method = "average")
dendro.PS <- as.dendrogram(cluster.PS)
plot(dendro.PS, center = T, nodePar = list(lab.cex = 0.6,
lab.col = "black", pch = NA),
main = "Dendrogram for Production situation")
# draw retangles
rect.hclust(tree = cluster.PS, k=2, border = c("red", "blue"))
#number of members in each cluster
PS.no <- cutree(cluster.PS, k = 2)
# cophenitic correlation
rcluster.PS <- cophenetic(cluster.PS)
#cor(dist.PS, rcluster.PS)
pheatmap(t(PS.data[-1]), cluster_rows = FALSE, clustering_distance_rows = "daisy",clustering_method = "average" )
data <- cbind(data, PS.no)
data$PS.no <- as.factor(data$PS.no)
data %>% ggplot(aes(x = PS.no)) + geom_histogram() + facet_grid(~country)
name.country <- levels(data$country)
by.country.data <- list()
for(i in 1:length(name.country)){
temp <- data %>%
filter(country == name.country[i])
by.country.data[[i]] <- temp
}
# The profile of PS no
clus.PS.data <- data[, -c(1, 17:56)]
m.PS.data <- melt(clus.PS.data)
## Using country, PS.no as id variables
name.variable <- levels(m.PS.data$variable)
name.PS.no <- levels(m.PS.data$PS.no)
data %>% ggplot(aes(x = PS.no)) + geom_histogram() + facet_grid(~country)
#head(data)
# select only the variables related to the injury profiles
start.IP <- "dhx"
end.IP <- "rtx"
start.col.IP <- match(start.IP, names(data))
end.col.IP <- match(end.IP, names(data))
IP.data <- data[start.col.IP:end.col.IP]
IP.data <- IP.data[ ,apply(IP.data, 2, var, na.rm = TRUE) != 0] # exclude the column with variation = 0
groups <- paste(data$country, data$PS.no, sep = "_")
IP.data <- cbind(groups, IP.data)
IP.data[is.na(IP.data)] <- 0
trts <- as.vector(unique(IP.data$groups))
#=====co_occurrence_pairwise.R====
results <- matrix(nrow = 0, ncol = 7)
options(warnings = -1)
for(a in 1:length(trts)){
#pull the first element from the vector of treatments
trt.temp <- trts[a]
#subset the dataset for those treatments
temp <- subset(IP.data, groups == trt.temp)
#in this case the community data started at column 6, so the loop for co-occurrence has to start at that point
for(b in 2:(dim(temp)[2]-1)){
#every species will be compared to every other species, so there has to be another loop that iterates down the rest of the columns
for(c in (b+1):(dim(temp)[2])){
#summing the abundances of species of the columns that will be compared
species1.ab <- sum(temp[,b])
species2.ab <- sum(temp[,c])
#if the column is all 0's no co-occurrence will be performed
if(species1.ab >1 & species2.ab >1){
test <- cor.test(temp[,b], temp[,c], method = "spearman", na.action = na.rm, exact = FALSE)
# There are warnings when setting exact = TRUE because of ties from the output of Spearman's correlation
# stackoverflow.com/questions/10711395/spear-man-correlation and ties
# It would be still valid if the data is not normally distributed.
rho <- test$estimate
p.value <- test$p.value
}
if(species1.ab <=1 | species2.ab <= 1){
rho<-0
p.value<-1
}
new.row <- c(trts[a], names(temp)[b], names(temp)[c], rho, p.value, species1.ab, species2.ab)
results <- rbind(results, new.row)
}
}
print(a/length(trts))
}
## [1] 0.1111111
## [1] 0.2222222
## [1] 0.3333333
## [1] 0.4444444
## [1] 0.5555556
## [1] 0.6666667
## [1] 0.7777778
## [1] 0.8888889
## [1] 1
results<-data.frame(data.matrix(results))
names(results)<-c("trt","taxa1","taxa2","rho","p.value","ab1","ab2")
#making sure certain variables are factors
results$trt <- as.factor(results$trt)
results$taxa1 <- as.character(as.factor(results$taxa1))
results$taxa2 <- as.character(as.factor(results$taxa2))
results$rho <- as.numeric(as.character(results$rho))
results$p.value <- as.numeric(as.character(results$p.value))
results$ab1 <- as.numeric(as.character(results$ab1))
results$ab2 <- as.numeric(as.character(results$ab2))
str(results)
## 'data.frame': 2484 obs. of 7 variables:
## $ trt : Factor w/ 9 levels "IDN_1","IDN_2",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ taxa1 : chr "dhx" "dhx" "dhx" "dhx" ...
## $ taxa2 : chr "whx" "ssx" "defa" "bphx" ...
## $ rho : num 0.0652 -0.1227 0.1899 -0.117 -0.5074 ...
## $ p.value: num 0.689188 0.45061 0.240542 0.472163 0.000832 ...
## $ ab1 : num 1307 1307 1307 1307 1307 ...
## $ ab2 : num 2089 109 1941 1286 84 ...
head(results)
## trt taxa1 taxa2 rho p.value ab1 ab2
## 1 PHL_1 dhx whx 0.06524012 0.6891879742 1307 2089
## 2 PHL_1 dhx ssx -0.12272044 0.4506097948 1307 109
## 3 PHL_1 dhx defa 0.18989475 0.2405423584 1307 1941
## 4 PHL_1 dhx bphx -0.11699678 0.4721633481 1307 1286
## 5 PHL_1 dhx wbpx -0.50743562 0.0008316712 1307 84
## 6 PHL_1 dhx awx 0.00000000 1.0000000000 1307 1
results$newrho <- ifelse(abs(results$rho) > 0.2, results$rho, 0)
# IDN1 <- results %>% filter(trt == "IDN_1")
# IDN2 <- results %>% filter(trt == "IDN_2")
# IND1 <- results %>% filter(trt == "IND_1")
# IND2 <- results %>% filter(trt == "IND_2")
# THA1 <- results %>% filter(trt == "THA_1")
# THA2 <- results %>% filter(trt == "THA_2")
# VNM1 <- results %>% filter(trt == "VNM_1")
# VNM2 <- results %>% filter(trt == "VNM_1")
# PHL1 <- results %>% filter(trt == "PHL_1")
results.by.group <- list()
name.groups <- c("IDN_1", "IDN_2", "IND_1", "IND_2", "THA_1", "THA_2", "VNM_1", "VNM_2", "PHL_1") #sort(as.vector(unique(groups)))
for(i in 1: length(name.groups)){
results.by.group[[i]] <- subset(results, trt == name.groups[i])
}
# head(results_sub.by.group[[1]][,2:3]) # get the list
g <- list()
for(i in 1:length(name.groups)){
g[[i]] <- graph.edgelist(as.matrix(results.by.group[[i]][,2:3]), directed = FALSE)
#== adjust layout
l <- layout.circle(g[[i]])
#== adjust vertices
V(g[[i]])$color <- "tomato"
V(g[[i]])$frame.color <- "gray40"
V(g[[i]])$shape <- "circle"
V(g[[i]])$size <- 25
V(g[[i]])$label.color <- "white"
V(g[[i]])$label.font <- 2
V(g[[i]])$label.family <- "Helvetica"
V(g[[i]])$label.cex <- 0.7
#== adjust the edge
E(g[[i]])$weight <- as.matrix(results.by.group[[i]][, 8])
E(g[[i]])$width <- 1 + E(g[[i]])$weight*5
col <- c("firebrick3", "forestgreen")
colc <- cut(results.by.group[[i]]$newrho, breaks = c(-1, 0, 1), include.lowest = TRUE)
#levels(colc)[1] <- "0"
E(g[[i]])$color <- col[colc]
g[[i]] <- delete_edges(g[[i]], which(E(g[[i]])$weight == 0))
}
adj.mat <- list()
i <- 2
for(i in 1:length(g)){
adj.mat[[i]] <- as_adjacency_matrix(g[[i]], attr = "weight")
adj.mat[[i]][is.na(adj.mat[[i]])] <- 0
assign(paste("adj.mat.", name.groups[i], sep = ""), adj.mat[[i]])
}
## Note: method with signature 'CsparseMatrix#Matrix#missing#replValue' chosen for function '[<-',
## target signature 'dgCMatrix#ngCMatrix#missing#numeric'.
## "Matrix#nsparseMatrix#missing#replValue" would also be valid
## Note: method with signature 'CsparseMatrix#Matrix#missing#replValue' chosen for function '[<-',
## target signature 'dgCMatrix#nsCMatrix#missing#numeric'.
## "Matrix#nsparseMatrix#missing#replValue" would also be valid
Now we will have nine adjacency matrix based on Spearman’s correlation of Indonesia PS1, Indonesia PS2, India PS1, India PS2, Thailand PS1, Thailand PS2, Vietnam PS1, Vietnam PS2, and Philippines PS1
#adj.mat.IDN_1, adj.mat.IDN_2, adj.mat.IND_1, adj.matIND_2, adj.mat.THA_1, adj.mat.THA2, adj.mat.VNM_1, adj.mat.VNM2, adj.mat.PHL_1
pair.IP.list <- matrix(nrow = 0, ncol = 2)
for(b in 2:(dim(IP.data)[2]-1)){
for(c in (b+1):(dim(IP.data)[2])){
new.row <- c(names(IP.data)[b], names(IP.data)[c])
pair.IP.list <- rbind(pair.IP.list, new.row)
}
}
pair.IP.list <- data.frame(data.matrix(pair.IP.list))
## Warning in data.row.names(row.names, rowsi, i): some row.names duplicated:
## 2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276
## --> row.names NOT used
names(pair.IP.list)<-c("IPvar1","IPvar2")
### Calculation of differential correlations Fisher’s z-test was used to identify significant differences between 2 correlations, based on its stringency test and its provision of conservative estimates of true differential correlations among molecules between 2 experimental conditions in the omics data. To test whether the 2 correlation coefficients were significantly different, we first transformed correlation coefficients for each of the 2 conditions, rA and rB, into \(Z_A\) and \(Z_B\), respectively. The Fisher’s transformation of coefficient \(r_A\) is defined by: \(Z = \frac{1}{2}\log{\frac{1+r_A}{1-r_A}}}\)
Similarly,
Fisher’s z-transformation of r is defined as
\(Z = \frac{Z_A - Z_B}{\sqrt{\frac{1}{n_A} + \frac{1}{n_B}}}\)
we transform coefficient rB into ZB. Differences between the two cor- relations can be tested using the following Eq. (1)nA and nB represent the sample size for each of the conditions for each biomolecule pair (Fukushima et al., 2011; Fukushima et al., 2012; Morgenthal et al., 2006). The Z value has an approximately Gaussian distribution under the null hypothesis that the population correlations are equal. Controlling the FDR described by Benjamini and Hochberg (1995) is a stringent and practical method in multiple testing problems. However, while it assumes all tests to be independent, this is not the case for correlation tests. We therefore used the local false-discovery rate (fdr) derived from the fdrtool package (Strimmer, 2008).
#cortest.normal(adj.mat[[1]], adj.mat[[2]], n1=1000, n2=1000)
diff.corr <- function(data1,n1, data2, n2, N){
cc1 <- data1
cc2 <- data2
ccc1 <- as.vector(cc1[lower.tri(cc1)])
ccc2 <- as.vector(cc2[lower.tri(cc2)])
n1 <- n1
n2 <- n2
n <- N
N <- n * (n - 1)/2
p1 <- rep(1, N)
p2 <- rep(1, N)
pdiff <- rep(1, N)
diff <- rep(1, N)
mol.names <- rownames(adj.mat[[1]])
p1 <- cor2.test(n1, ccc1)
p2 <- cor2.test(n2, ccc2)
pdiff <- compcorr(n1, ccc1, n2, ccc2)$pval
diff <- ccc1 - ccc2
myindex <- which((lower.tri(cc1)) == TRUE, arr.ind = TRUE)
mol.names1 <- mol.names[myindex[, 2]]
mol.names2 <- mol.names[myindex[, 1]]
fin.ind <- pdiff < 0.05
res <- cbind(mol.names1[fin.ind], mol.names2[fin.ind], ccc1[fin.ind], p1[fin.ind], ccc2[fin.ind], p2[fin.ind], pdiff[fin.ind], diff[fin.ind])
res <- as.data.frame(res)
names(res) <- c("var1", "var2", "r1", "p1", "r2", "p2", "p.difference", "difr")
str(res)
res$var1 <- as.character(res$var1)
res$var2 <- as.character(res$var2)
res$r1 <- as.numeric(as.character(res$r1))
res$p1 <- as.numeric(as.character(res$p1))
res$r2 <- as.numeric(as.character(res$r2))
res$p2 <- as.numeric(as.character(res$p2))
res$p.difference <- as.numeric(as.character(res$p.difference))
res$difr <- as.numeric(as.character(res$difr))
return(res)
}
res <- diff.corr(data1 = adj.mat.IDN_1, 24, data2 = adj.mat.IDN_2, 24, 100)
## 'data.frame': 2 obs. of 8 variables:
## $ var1 : Factor w/ 2 levels "rbx","shbx": 1 2
## $ var2 : Factor w/ 2 levels "rbbx","shrx": 1 2
## $ r1 : Factor w/ 1 level "0": 1 1
## $ p1 : Factor w/ 1 level "1": 1 1
## $ r2 : Factor w/ 2 levels "0.618016617766193",..: 2 1
## $ p2 : Factor w/ 2 levels "0.000975682010393041",..: 1 2
## $ p.difference: Factor w/ 2 levels "0.0163508079151282",..: 1 2
## $ difr : Factor w/ 2 levels "-0.618016617766193",..: 2 1
diff_comb <- left_join(pair.IP.list, res[, -c(3:7)], by = c("IPvar1" = "var1", "IPvar2" = "var2"))
## Warning in left_join_impl(x, y, by$x, by$y): joining character vector and
## factor, coercing into character vector
## Warning in left_join_impl(x, y, by$x, by$y): joining character vector and
## factor, coercing into character vector
diff_comb[is.na(diff_comb)] <- 0
res
## var1 var2 r1 p1 r2 p2 p.difference difr
## 1 rbx rbbx 0 1 0.6297257 0.000975682 0.01635081 -0.6297257
## 2 shbx shrx 0 1 0.6180166 0.001289351 0.01934238 -0.6180166
par(mfrow = c(1,3))
com_g <- graph.edgelist(as.matrix(diff_comb[1:2]), directed = FALSE)
#== adjust vertices
V(com_g)$color <- "tomato"
V(com_g)$frame.color <- "gray40"
V(com_g)$shape <- "circle"
V(com_g)$size <- 25
V(com_g)$label.color <- "white"
V(com_g)$label.font <- 2
V(com_g)$label.family <- "Helvetica"
V(com_g)$label.cex <- 0.7
#== adjust the edge
E(com_g)$weight <- as.matrix(diff_comb[, 3])
E(com_g)$width <- 1 + E(com_g)$weight*5
col <- c("firebrick3", "forestgreen")
colc <- cut(diff_comb$difr, breaks = c(-1, 0, 1), include.lowest = TRUE)
#levels(colc)[1] <- "0"
E(com_g)$color <- col[colc]
com_g <- delete_edges(com_g, which(E(com_g)$weight == 0))
plot(g[[1]], layout = layout.circle, main = "Indonesia at PS1")
plot(com_g, layout = layout.circle, main = "Differential network \n between PS1 and PS2 ")
plot(g[[2]], layout = layout.circle, main = "Indonesia at PS2")
res2 <- diff.corr(data1 = adj.mat.IND_1, 24, data2 = adj.mat.IND_2, 24, 104)
## 'data.frame': 3 obs. of 8 variables:
## $ var1 : Factor w/ 3 levels "bphx","fsmx",..: 1 3 2
## $ var2 : Factor w/ 2 levels "fsmx","nbx": 1 2 2
## $ r1 : Factor w/ 3 levels "-0.638965963104735",..: 2 1 3
## $ p1 : Factor w/ 3 levels "0.000337521735415126",..: 3 2 1
## $ r2 : Factor w/ 2 levels "0","0.549593752955177": 2 1 1
## $ p2 : Factor w/ 2 levels "0.00540368298804368",..: 1 2 2
## $ p.difference: Factor w/ 3 levels "0.00855182620104511",..: 3 2 1
## $ difr : Factor w/ 3 levels "-0.549593752955177",..: 1 2 3
diff_comb <- left_join(pair.IP.list, res2[, -c(3:7)], by = c("IPvar1" = "var1", "IPvar2" = "var2"))
## Warning in left_join_impl(x, y, by$x, by$y): joining character vector and
## factor, coercing into character vector
## Warning in left_join_impl(x, y, by$x, by$y): joining character vector and
## factor, coercing into character vector
diff_comb[is.na(diff_comb)] <- 0
res2
## var1 var2 r1 p1 r2 p2 p.difference
## 1 bphx fsmx 0.0000000 1.0000000000 0.5495938 0.005403683 0.045295727
## 2 shbx nbx -0.6389660 0.0007767876 0.0000000 1.000000000 0.014242243
## 3 fsmx nbx 0.6704014 0.0003375217 0.0000000 1.000000000 0.008551826
## difr
## 1 -0.5495938
## 2 -0.6389660
## 3 0.6704014
par(mfrow = c(1,3))
com_g <- graph.edgelist(as.matrix(diff_comb[1:2]), directed = FALSE)
#== adjust vertices
V(com_g)$color <- "tomato"
V(com_g)$frame.color <- "gray40"
V(com_g)$shape <- "circle"
V(com_g)$size <- 25
V(com_g)$label.color <- "white"
V(com_g)$label.font <- 2
V(com_g)$label.family <- "Helvetica"
V(com_g)$label.cex <- 0.7
#== adjust the edge
E(com_g)$weight <- as.matrix(diff_comb[, 3])
E(com_g)$width <- 1 + E(com_g)$weight*5
col <- c("firebrick3", "forestgreen")
colc <- cut(diff_comb$difr, breaks = c(-1, 0, 1), include.lowest = TRUE)
#levels(colc)[1] <- "0"
E(com_g)$color <- col[colc]
com_g <- delete_edges(com_g, which(E(com_g)$weight == 0))
plot(g[[3]], layout = layout.circle, main = "India at PS1")
plot(com_g, layout = layout.circle, main = "Differential network \n between PS1 and PS2 ")
plot(g[[4]], layout = layout.circle, main = "INdia at PS2")
res2 <- diff.corr(data1 = adj.mat.THA_1, 24, data2 = adj.mat.THA_2, 24, 105)
## 'data.frame': 1 obs. of 8 variables:
## $ var1 : Factor w/ 1 level "lba": 1
## $ var2 : Factor w/ 1 level "shbx": 1
## $ r1 : Factor w/ 1 level "-0.33839819964214": 1
## $ p1 : Factor w/ 1 level "0.105783730450418": 1
## $ r2 : Factor w/ 1 level "0.293145674356346": 1
## $ p2 : Factor w/ 1 level "0.16446383583604": 1
## $ p.difference: Factor w/ 1 level "0.033994983692041": 1
## $ difr : Factor w/ 1 level "-0.631543873998486": 1
diff_comb <- left_join(pair.IP.list, res2[, -c(3:7)], by = c("IPvar1" = "var1", "IPvar2" = "var2"))
## Warning in left_join_impl(x, y, by$x, by$y): joining character vector and
## factor, coercing into character vector
## Warning in left_join_impl(x, y, by$x, by$y): joining character vector and
## factor, coercing into character vector
diff_comb[is.na(diff_comb)] <- 0
res2
## var1 var2 r1 p1 r2 p2 p.difference
## 1 lba shbx -0.3383982 0.1057837 0.2931457 0.1644638 0.03399498
## difr
## 1 -0.6315439
par(mfrow = c(1,3))
com_g <- graph.edgelist(as.matrix(diff_comb[1:2]), directed = FALSE)
#== adjust vertices
V(com_g)$color <- "tomato"
V(com_g)$frame.color <- "gray40"
V(com_g)$shape <- "circle"
V(com_g)$size <- 25
V(com_g)$label.color <- "white"
V(com_g)$label.font <- 2
V(com_g)$label.family <- "Helvetica"
V(com_g)$label.cex <- 0.7
#== adjust the edge
E(com_g)$weight <- as.matrix(diff_comb[, 3])
E(com_g)$width <- 1 + E(com_g)$weight*5
col <- c("firebrick3", "forestgreen")
colc <- cut(diff_comb$difr, breaks = c(-1, 0, 1), include.lowest = TRUE)
#levels(colc)[1] <- "0"
E(com_g)$color <- col[colc]
com_g <- delete_edges(com_g, which(E(com_g)$weight == 0))
plot(g[[5]], layout = layout.circle, main = "Thaialnd at PS1")
plot(com_g, layout = layout.circle, main = "Differential network \n between PS1 and PS2")
plot(g[[6]], layout = layout.circle, main = " Thailand at PS2")
res2 <- diff.corr(data1 = adj.mat.VNM_1, 24, data2 = adj.mat.VNM_2, 24, 105)
## 'data.frame': 6 obs. of 8 variables:
## $ var1 : Factor w/ 6 levels "bsa","defa","rbx",..: 6 2 5 3 1 4
## $ var2 : Factor w/ 5 levels "nbsa","nbx","rsdx",..: 3 5 1 4 2 2
## $ r1 : Factor w/ 6 levels "-0.556443296678694",..: 6 5 1 3 2 4
## $ p1 : Factor w/ 6 levels "0.000925099920001324",..: 1 3 5 6 2 4
## $ r2 : Factor w/ 3 levels "-0.205469676672969",..: 2 3 3 1 3 3
## $ p2 : Factor w/ 3 levels "0.215469830227002",..: 1 3 3 2 3 3
## $ p.difference: Factor w/ 6 levels "0.00102582911299476",..: 1 4 6 2 3 5
## $ difr : Factor w/ 6 levels "-0.556443296678694",..: 6 4 1 5 2 3
diff_comb <- left_join(pair.IP.list, res2[, -c(3:7)], by = c("IPvar1" = "var1", "IPvar2" = "var2"))
## Warning in left_join_impl(x, y, by$x, by$y): joining character vector and
## factor, coercing into character vector
## Warning in left_join_impl(x, y, by$x, by$y): joining character vector and
## factor, coercing into character vector
diff_comb[is.na(diff_comb)] <- 0
res2
## var1 var2 r1 p1 r2 p2 p.difference
## 1 whx rsdx 0.6319098 0.0009250999 -0.2623898 0.2154698 0.001025829
## 2 defa shrx 0.5822719 0.0028332224 0.0000000 1.0000000 0.030948013
## 3 wbpx nbsa -0.5564433 0.0047449481 0.0000000 1.0000000 0.041964526
## 4 rbx shbx 0.4834204 0.0167035186 -0.2054697 0.3354541 0.017102023
## 5 bsa nbx -0.5874529 0.0025417535 0.0000000 1.0000000 0.029016848
## 6 shrx nbx 0.5775572 0.0031226151 0.0000000 1.0000000 0.032783763
## difr
## 1 0.8942996
## 2 0.5822719
## 3 -0.5564433
## 4 0.6888900
## 5 -0.5874529
## 6 0.5775572
par(mfrow = c(1,3))
com_g <- graph.edgelist(as.matrix(diff_comb[1:2]), directed = FALSE)
#== adjust vertices
V(com_g)$color <- "tomato"
V(com_g)$frame.color <- "gray40"
V(com_g)$shape <- "circle"
V(com_g)$size <- 25
V(com_g)$label.color <- "white"
V(com_g)$label.font <- 2
V(com_g)$label.family <- "Helvetica"
V(com_g)$label.cex <- 0.7
#== adjust the edge
E(com_g)$weight <- as.matrix(diff_comb[, 3])
E(com_g)$width <- 1 + E(com_g)$weight*5
col <- c("firebrick3", "forestgreen")
colc <- cut(diff_comb$difr, breaks = c(-1, 0, 1), include.lowest = TRUE)
#levels(colc)[1] <- "0"
E(com_g)$color <- col[colc]
com_g <- delete_edges(com_g, which(E(com_g)$weight == 0))
plot(g[[7]], layout = layout.circle , main = "Vietnam at PS1")
plot(com_g, layout = layout.circle, main = "Differential network \n between PS1 and PS2 ")
plot(g[[8]], layout = layout.circle, main = "Vietnam at PS2")
See how the yield data are distributed
#head(data)
# select only the variables related to the injury profiles
start.IP <- "dhx"
end.IP <- "rtx"
start.col.IP <- match(start.IP, names(data))
end.col.IP <- match(end.IP, names(data))
IP.data <- data[start.col.IP:end.col.IP]
IP.data <- IP.data[ ,apply(IP.data, 2, var, na.rm = TRUE) != 0] # exclude the column with variation = 0
yield <- data$yield
IP.Y.data <- cbind(yield, IP.data)
IP.Y.data[is.na(IP.Y.data)] <- 0
#hist(IP.Y.data$yield)
# from this histogram
# I would group the data into six groups of yields.
ylevel <- ifelse(IP.Y.data$yield <= 3, "Y3",
ifelse( 3 < IP.Y.data$yield & IP.Y.data$yield <= 4, "Y4",
ifelse(4 < IP.Y.data$yield & IP.Y.data$yield <= 5, "Y5",
ifelse(5 < IP.Y.data$yield & IP.Y.data$yield <= 6, "Y6",
ifelse(6 < IP.Y.data$yield & IP.Y.data$yield <= 7, "Y7", "Y8"
)))))
# y3 <- subset(IP.Y.data, yield <= 3)
# y4 <- subset(IP.Y.data, 3 < yield | yield >= 4 )
# y5 <- subset(IP.Y.data, 4 < yield | yield >= 5 )
# y6 <- subset(IP.Y.data, 5 < yield | yield >= 6 )
# y7 <- subset(IP.Y.data, 6 < yield | yield >= 7 )
# y8 <- subset(IP.Y.data, 7 < yield | yield >= 8 )
IP.Y.data <- cbind(ylevel, IP.Y.data)
IP.Y.data$ylevel <- as.factor(IP.Y.data$ylevel)
ylevels <- levels(IP.Y.data$ylevel)
IP.Y.data$activity.level <- rowSums(IP.Y.data[-1])
ggplot(IP.Y.data, aes(x= as.factor(ylevel), y= activity.level)) +
geom_boxplot() +
stat_summary(fun.y = median, geom = "line", aes(group = 1)) +
stat_summary(func.y = median, geom = "point")
ggplot(IP.Y.data, aes(x= yield, y= activity.level)) + geom_point()
IP.Y.data.new <- IP.Y.data[, !names(IP.Y.data) %in% c("yield", "activity.level")]
results <- matrix(nrow = 0, ncol = 7)
options(warnings = -1)
for(a in 1:length(ylevels)){
#pull the first element from the vector of treatments
y.temp <- ylevels[a]
#subset the dataset for those treatments
temp <- subset(IP.Y.data.new, ylevel == y.temp)
#in this case the community data started at column 6, so the loop for co-occurrence has to start at that point
for(b in 2:(dim(temp)[2]-1)){
#every species will be compared to every other species, so there has to be another loop that iterates down the rest of the columns
for(c in (b+1):(dim(temp)[2])){
#summing the abundances of species of the columns that will be compared
species1.ab <- sum(temp[, b])
species2.ab <- sum(temp[, c])
#if the column is all 0's no co-occurrence will be performed
if(species1.ab >1 & species2.ab >1){
test <- cor.test(temp[,b], temp[,c], method = "spearman", na.action = na.rm, exact = FALSE)
# There are warnings when setting exact = TRUE because of ties from the output of Spearman's correlation
# stackoverflow.com/questions/10711395/spear-man-correlation and ties
# It would be still valid if the data is not normally distributed.
rho <- test$estimate
p.value <- test$p.value
}
if(species1.ab <=1 | species2.ab <= 1){
rho<-0
p.value<-1
}
new.row <- c(ylevels[a], names(temp)[b], names(temp)[c], rho, p.value, species1.ab, species2.ab)
results <- rbind(results, new.row)
}
}
print(a/length(ylevels))
}
## [1] 0.1666667
## [1] 0.3333333
## [1] 0.5
## [1] 0.6666667
## [1] 0.8333333
## [1] 1
results <- data.frame(data.matrix(results))
names(results) <- c("ylevel","taxa1","taxa2","rho","p.value","ab1","ab2")
#making sure certain variables are factors
results$ylevel <- as.factor(results$ylevel)
results$taxa1 <- as.character(as.factor(results$taxa1))
results$taxa2 <- as.character(as.factor(results$taxa2))
results$rho <- as.numeric(as.character(results$rho))
results$p.value <- as.numeric(as.character(results$p.value))
results$ab1 <- as.numeric(as.character(results$ab1))
results$ab2 <- as.numeric(as.character(results$ab2))
str(results)
## 'data.frame': 1656 obs. of 7 variables:
## $ ylevel : Factor w/ 6 levels "Y3","Y4","Y5",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ taxa1 : chr "dhx" "dhx" "dhx" "dhx" ...
## $ taxa2 : chr "whx" "ssx" "defa" "bphx" ...
## $ rho : num 0.1425 0.3869 0.2996 0.0659 -0.1179 ...
## $ p.value: num 0.5165 0.0682 0.1648 0.7652 0.5921 ...
## $ ab1 : num 282 282 282 282 282 282 282 282 282 282 ...
## $ ab2 : num 1243 208 52 1109 83 ...
head(results)
## ylevel taxa1 taxa2 rho p.value ab1 ab2
## 1 Y3 dhx whx 0.14253716 0.51647489 282 1243
## 2 Y3 dhx ssx 0.38687783 0.06819797 282 208
## 3 Y3 dhx defa 0.29964515 0.16481128 282 52
## 4 Y3 dhx bphx 0.06588643 0.76518142 282 1109
## 5 Y3 dhx wbpx -0.11789000 0.59214825 282 83
## 6 Y3 dhx awx 0.00000000 1.00000000 282 0
# sub_y.results <- subset(results, as.numeric(as.character(abs(rho))) > 0.25)
sub_y.results <- results %>% filter(p.value < 0.05)
sub_ylevels.results <- list()
for(i in 1: length(ylevels)){
sub_ylevels.results[[i]] <- subset(sub_y.results, ylevel == ylevels[i])
}
# head(results_sub.by.group[[1]][,2:3]) # get the list
par(mfrow = c(2, 3))
ynet <- list()
for(i in 1:length(ylevels)){
ynet[[i]] <- graph.edgelist(as.matrix(sub_ylevels.results[[i]][, 2:3]), directed = FALSE)
#== adjust layout
l <- layout.circle(ynet[[i]])
#== adjust vertices
V(ynet[[i]])$color <- "tomato"
V(ynet[[i]])$frame.color <- "gray40"
V(ynet[[i]])$shape <- "circle"
V(ynet[[i]])$size <- 25
V(ynet[[i]])$label.color <- "white"
V(ynet[[i]])$label.font <- 2
V(ynet[[i]])$label.family <- "Helvetica"
V(ynet[[i]])$label.cex <- 0.7
#== adjust the edge
E(ynet[[i]])$weight <- as.matrix(sub_ylevels.results[[i]][, 4])
E(ynet[[i]])$width <- 1 + E(ynet[[i]])$weight*5
col <- c("firebrick3", "forestgreen")
colc <- cut(sub_ylevels.results[[i]]$rho, breaks = c(-1, 0, 1), include.lowest = TRUE)
E(ynet[[i]])$color <- col[colc]
#== plot network model
plot(ynet[[i]], layout = l * 1.0, main = paste( "network model of", ylevels[i]))
}
network.value <- list()
i <-1
for(i in 1:length(ynet)){
E(ynet[[i]])$weight <- abs(as.matrix(sub_ylevels.results[[i]][, 4]))
network.value[[i]] <- data.frame(
ylevel = ylevels[i],
node = V(ynet[[i]])$name,
strength = graph.strength(ynet[[i]]),
betweenness = betweenness(ynet[[i]]),
closeness = closeness(ynet[[i]]),
eigenvector = evcent(ynet[[i]])$vector,
clusterCoef = transitivity(ynet[[i]], type=c("local"))
)
}
network.value <- as.data.frame(do.call("rbind", network.value))
row.names(network.value ) <- NULL
p1 <- ggplot(network.value, aes(x = ylevel, y = closeness)) + geom_boxplot()
p2 <- ggplot(network.value, aes(x = ylevel, y = betweenness)) + geom_boxplot()
p3 <- ggplot(network.value, aes(x = ylevel, y = strength)) + geom_boxplot()
p4 <- ggplot(network.value, aes(x = ylevel, y = eigenvector)) + geom_boxplot()
p5 <- ggplot(network.value, aes(x = ylevel, y = clusterCoef)) + geom_boxplot()
p6 <- ggplot(IP.Y.data, aes(x= as.factor(ylevel), y= activity.level)) +
# geom_boxplot() +
stat_summary(fun.y = median, geom = "line", aes(group = 1))
#stat_summary(func.y = median, geom = "point")
grid.arrange(p1, p2, p3, p4, p5, p6)
## Warning: Removed 13 rows containing non-finite values (stat_boxplot).